rm(list=ls(all=TRUE))
###############################
#### Directories
###############################
directory <- c("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/")
directory_save <- c("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/c.figures_analysis/")
# directory <- file.choose(). # pick directory where the file is downloaded
raw_dir_proteome="b.raw_data/proteome/"
ciona_stages <- c("unfE","fertE","cell16","iniG","latN","midTII","latTII","larva")
ciona_stages_beautify <- c("unfertilised","1-cell","16-cell","110-cell","late neurula","mid tailbud","late tailbud","larva")
###############################
#### Themes and colors
###############################
ciona_stages_palette <- c("#009392FF", "#39B185FF", "#9CCB86FF", "#E9E29CFF", "#EEB479FF", "#E88471FF", "#CF597EFF", "#A22E3B")
ciona_cluster_palette <- c("#88A0DCFF", "#381A61FF", "#7C4B73FF", "#ED968CFF", "#AB3329FF", "#E78429FF", "#F9D14AFF", "#7FA074FF") # > paletteer::paletteer_d("MetBrewer::Archambault") + MetBrewer::Cassatt2" - Colorblind-Friendly
###############################
#### Functions
###############################
# Parallelel plot
function_ggparallel <- function(data, col_names, group_col, plot_title, alpha_lines = 1, show_points = TRUE, plot_scale = "globalminmax", ...) {
# Plot data
plot <- ggparcoord(data = data,
columns = col_names,
groupColumn = group_col,
title = plot_title,
alphaLines = alpha_lines,
showPoints = show_points,
scale = plot_scale,
...)
plot <- plot +
theme_pubr(base_family = "Avenir") +
labs(x="") +
theme(legend.position = "bottom",
legend.text = element_text(size=15)) # Remove legend
return(plot)
}
function_plotGuides = guides(
color = "none",
size = guide_legend(title="Proteins number",
title.theme = element_text(size = 15,face = "bold"),
direction = "horizontal",
title.position = "top",
nrow=1))
# function for cluster plots
plot_cluster_norm <- function(df, title) {
df %>%
pivot_longer(cols = c(1:8), names_to = "stage", values_to = "expression") %>%
mutate(stage = str_remove(stage, "_byRowCol")) %>%
mutate(stage = fct_relevel(stage, ciona_stages), cluster = factor(cluster)) %>%
ggplot(aes(stage, expression, color = cluster)) +
geom_point() +
geom_line(aes(group = interaction(cluster, size), size = as.factor(size))) +
scale_color_manual(values = ciona_cluster_palette) +
theme_pubr(base_family = "Avenir") +
theme(legend.position = "bottom", plot.title = element_text(size = 10)) +
labs(title = title, x = "Developmental stages", y="Relative expression") +
scale_size_discrete(name = "Cluster size") # rename the size legend
}
# function for cluster pivot
cluster_pivot_data <- function(data, cluster, cols) {
data %>%
filter(.cluster == cluster) %>%
dplyr::select(all_of(cols)) %>%
pivot_longer(-1) %>%
mutate(name = case_when(name == "unfE_byRowCol" ~ 1, name == "fertE_byRowCol" ~ 2, name == "cell16_byRowCol" ~ 3, name == "iniG_byRowCol" ~ 4, name == "latN_byRowCol" ~ 5, name == "midTII_byRowCol" ~ 6, name == "latTII_byRowCol" ~ 7, name == "larva_byRowCol" ~ 8))
}
# function to linear regression
plot_cluster_lr <- function(data, title, norm, tag) {
ggplot(data, aes(x = name, y = value)) +
geom_point() +
ylim(0, 8) +
theme_pubr(base_family = "Avenir") +
geom_smooth(method = "lm", se = FALSE) +
labs(title = title,
subtitle = paste(
"Adj R^2 = ", signif(summary(lm(value ~ name, data = data))$adj.r.squared, 5),
"Intercept = ", signif(coef(lm(value ~ name, data = data))[1], 5),
"Slope = ", signif(coef(lm(value ~ name, data = data))[2], 5), size = 5),
tag= tag,
y="Relative expression",
x="Stage") +
if(norm) { scale_y_continuous(labels = function(x) format(x, scientific = FALSE, big.mark = ","))}
}
# check HGNCmitocarta results with plot: individual proteins
plot_HGNCmitocarta_dynamics <- function(from, to) {
ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
rename_with(.fn = ~str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
ungroup() %>%
slice(from:to) %>%
pivot_longer(cols = c(4:11),
names_to = "stage",
values_to = "expression") %>%
mutate(stage = fct_relevel(stage, ciona_stages)) %>%
ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
geom_point(aes(group = Protein_Id_transcript)) +
geom_line(aes(group = Protein_Id_transcript)) +
facet_wrap(~Protein_Id_transcript) +
theme_pubr(legend = "none",base_family = "Avenir") +
ylim(0,6)
}Analysis.Ciona_TMTproC_normalization
1 TMTproC Ciona
We conducted a quantitative complement tandem mass tag (TMTpro) mass spectrometry (MS) analysis (Johnson et al., 2021) of the proteome of Ciona embryos across eight developmental stages, spanning unfertilized eggs, MZT, neurulation, tail elongation, and larval stage. Notably, the use of a multiplexed proteomics method allows for straightforward interpretation of low signal as low protein abundance, presenting a significant advantage over label-free measurements, where the absence of signal may be due to either low abundance or the mass spectrometer’s failure to detect the protein of interest (Pappireddi et al., 2019).
1.0.0.1 Loading libraries
1.0.0.2 Functions and misc
1.0.0.3 Loading data
###############################
#### Loading the data
###############################
ciona_TMTproC_prenormal.df <-
as_tibble(
read_tsv(file = paste0(directory, raw_dir_proteome, "ky21_w_oxmet_protein_quant_pre.normalisation.tsv"), col_names = T))
# cleaning up the names
lookup <- c("unfE" = "TMTPro126", "fertE" = "TMTPro128C", "cell16" = "TMTPro129N", "iniG" = "TMTPro130C", "latN" = "TMTPro131N", "midTII" = "TMTPro131C", "latTII" = "TMTPro133C", "larva" = "TMTPro134N")
ciona_TMTproC_prenormal.df <-
ciona_TMTproC_prenormal.df %>%
rename(all_of(lookup)) %>% # Rename the column
filter(!grepl("_contaminant", Protein_Id, ignore.case = TRUE)) %>% # exclude human/pig/TEV contaminant, 42 entries
filter(!grepl("##", Protein_Id, ignore.case = TRUE)) %>% # exclude scramble input for FDR correction
dplyr::select(-c("Gene_Symbol", "Description", "Group_ID", "Collapsed", "Parsimony", "default~ratios0 sum", "default~ratios9 sum")) %>%
mutate(Protein_Id_Gene = gsub(pattern = "\\.v\\S*\\..*", replacement = "", Protein_Id)) %>% # transcript to gene ID
rename(Protein_Id_transcript = Protein_Id) %>%
arrange(Protein_Id_transcript) %>%
relocate(Protein_Id_transcript, Protein_Id_Gene, everything())
glimpse(ciona_TMTproC_prenormal.df)Rows: 7,095
Columns: 11
$ Protein_Id_transcript <chr> "KY21.Chr1.10.v1.nonSL2-1", "KY21.Chr1.1000.v1.SL1-1", "KY21.Chr1.1001.v1.SL1-1", "KY21.Chr1.1002.v1.SL1-1", "KY21.Chr1.1004.v1.nonSL4-1", "KY21.Chr1.1005.v1.ND1-1", "KY21.Chr1.1006.v1.SL1-1", "KY21.Chr1.1009.v1.ND1-1", "KY21.Chr1.101.v1.SL1-1", "KY21.Chr1.1010.v1.nonSL1-1", "KY21.Chr1.1011.v1.SL1-1", "KY21.Chr1.1012.v1.SL1-1", "KY21.Chr1.1013.v1.ND1-1", "KY21.Chr1.1014.v2.ND2-1", "KY21.Chr1.1015.v1.SL1-1", "KY21.Chr1.1016.v1.SL1-1", "KY21.Chr1.1018.v1.nonSL1-1", "KY21.Chr1.1019.v1.SL1-1", "KY21.Chr1.102.v1.nonSL2-1", "KY21.Chr1.1020.v1.SL1-1", "KY21.Chr1.1022.v1.ND1-1", "KY21.Chr1.1024.v1.SL1-1", "KY21.Chr1.1027.v2.nonSL3-1", "KY21.Chr1.1029.v1.SL1-1", "KY21.Chr1.1031.v1.ND1-1", "KY21.Chr1.1035.v1.SL1-1", "KY21.Chr1.1040.v1.nonSL7-1", "KY21.Chr1.1041.v1.nonSL2-1", "KY21.Chr1.1042.v1.nonSL4-1", "KY21.Chr1.1045.v1.SL1-1", "KY21.Chr1.1046.v1.SL1-1", "KY21.Chr1.1050.v1.SL1-1", "KY21.Chr1.1051.v1.SL1-1", "KY21.Chr1.1055.v1.SL1-1", "KY21.Chr1.1056.v1.nonSL3-1…
$ Protein_Id_Gene <chr> "KY21.Chr1.10", "KY21.Chr1.1000", "KY21.Chr1.1001", "KY21.Chr1.1002", "KY21.Chr1.1004", "KY21.Chr1.1005", "KY21.Chr1.1006", "KY21.Chr1.1009", "KY21.Chr1.101", "KY21.Chr1.1010", "KY21.Chr1.1011", "KY21.Chr1.1012", "KY21.Chr1.1013", "KY21.Chr1.1014", "KY21.Chr1.1015", "KY21.Chr1.1016", "KY21.Chr1.1018", "KY21.Chr1.1019", "KY21.Chr1.102", "KY21.Chr1.1020", "KY21.Chr1.1022", "KY21.Chr1.1024", "KY21.Chr1.1027", "KY21.Chr1.1029", "KY21.Chr1.1031", "KY21.Chr1.1035", "KY21.Chr1.1040", "KY21.Chr1.1041", "KY21.Chr1.1042", "KY21.Chr1.1045", "KY21.Chr1.1046", "KY21.Chr1.1050", "KY21.Chr1.1051", "KY21.Chr1.1055", "KY21.Chr1.1056", "KY21.Chr1.1057", "KY21.Chr1.1058", "KY21.Chr1.1060", "KY21.Chr1.1061", "KY21.Chr1.1062", "KY21.Chr1.1063", "KY21.Chr1.1066", "KY21.Chr1.107", "KY21.Chr1.1072", "KY21.Chr1.1073", "KY21.Chr1.1077", "KY21.Chr1.1078", "KY21.Chr1.108", "KY21.Chr1.1085", "KY21.Chr1.1088", "KY21.Chr1.1089", "KY21.Chr1.109", "KY21.Chr1.1093", "KY21.Chr1.1095", "KY…
$ Number_of_peptides <dbl> 5, 13, 15, 6, 21, 9, 1, 12, 9, 1, 2, 24, 5, 25, 1, 11, 3, 11, 2, 2, 2, 10, 7, 2, 3, 8, 2, 1, 8, 4, 8, 17, 6, 14, 3, 13, 1, 4, 23, 3, 5, 13, 9, 34, 7, 11, 1, 2, 2, 17, 12, 1, 12, 2, 39, 5, 6, 2, 4, 5, 5, 4, 7, 4, 30, 8, 2, 17, 7, 142, 23, 1, 1, 7, 6, 14, 15, 5, 5, 4, 13, 9, 4, 7, 10, 3, 17, 10, 10, 1, 5, 1, 6, 6, 54, 4, 14, 32, 5, 19, 3, 1, 8, 10, 1, 5, 10, 4, 2, 2, 25, 2, 5, 64, 3, 5, 7, 10, 1, 1, 2, 5, 9, 26, 6, 22, 5, 8, 2, 1, 4, 27, 1, 1, 33, 19, 1, 13, 11, 1, 4, 4, 2, 15, 2, 20, 8, 24, 3, 2, 3, 4, 24, 15, 5, 3, 1, 1, 6, 1, 3, 24, 19, 9, 1, 26, 7, 8, 5, 1, 42, 11, 1, 13, 2, 4, 29, 22, 3, 7, 6, 2, 1, 13, 19, 3, 28, 8, 1, 4, 15, 9, 3, 25, 11, 7, 4, 14, 2, 9, 6, 7, 8, 10, 3, 5, 6, 1, 6, 1, 3, 1, 20, 16, 28, 4, 2, 3, 11, 1, 18, 2, 2, 3, 6, 6, 9, 2, 10, 8, 4, 2, 1, 1, 4, 4, 3, 7, 1, 5, 24, 1, 5, 2, 23, 3, 2, 15, 18, 17, 3, 15, 2, 1, 18, 2, 4, 7, 4, 13, 9, 10, 1, 21, 2, 15, 8, 2, 2, 10, 8, 3, 4, 29, 10, 4, 10, 38, 3, 3, 4, 3, 12, 12, 4, 53, 15, 3, 4, 1, 8, …
$ unfE <dbl> 6.79550e-01, 1.80478e+00, 1.77061e+00, 2.66736e+00, 2.49546e+00, 1.18459e+00, 2.12080e-01, 2.15231e+00, 9.83251e-01, 2.93230e-01, 1.81664e-01, 2.67143e+00, 6.81070e-01, 3.21492e+00, 1.40200e-01, 1.19374e+00, 3.10652e-01, 1.16788e+00, 8.01760e-02, 3.84460e-01, 1.16219e-01, 1.41303e+00, 7.58105e-01, 2.03191e-01, 3.69990e-01, 6.47148e-01, 1.62202e-01, 2.97860e-01, 1.16880e+00, 3.64094e+00, 4.94889e-01, 2.19138e+00, 5.69127e-01, 1.96017e+00, 4.12340e-01, 2.42170e+00, 1.33990e-01, 3.06282e-01, 2.02815e+00, 3.20270e-01, 8.49990e-02, 1.45603e+00, 9.73567e-01, 4.39201e+00, 3.04738e-01, 1.29174e+00, 8.45240e-01, 9.60280e-02, 2.95710e-01, 1.86714e+00, 1.45689e+00, 8.31570e-03, 1.00654e+00, 1.60516e-01, 4.50932e+00, 2.96806e-01, 7.19776e-01, 2.56220e-01, 1.56429e-01, 6.51930e-01, 6.41110e-01, 5.46610e-01, 8.60690e-01, 2.85782e-01, 3.97738e+00, 8.18631e-01, 4.01320e-01, 2.18791e+00, 6.97508e-01, 1.92738e+01, 5.85748e-01, 1.95630e-01, 2.70560e-02, 2.27420e+00, 6.7773…
$ fertE <dbl> 0.57898000, 1.61365000, 1.47541000, 2.22937000, 2.12542000, 1.04231000, 0.16163000, 1.79585000, 0.83099500, 0.28886000, 0.15184300, 2.39866000, 0.51786800, 2.79024000, 0.11953000, 1.07551000, 0.34142300, 0.99176000, 0.06759600, 0.24660000, 0.10924900, 1.32116000, 0.65158100, 0.15684400, 0.29204400, 0.51322100, 0.15263800, 0.07427600, 0.95676400, 0.22614100, 0.54806600, 1.91668000, 0.33684600, 1.65419000, 0.33958000, 2.13430000, 0.10245000, 0.26434300, 1.88775000, 0.29613000, 0.06304520, 1.34814000, 0.89273200, 3.75227000, 0.26859700, 1.10863000, 0.05636300, 0.10972500, 0.22016300, 1.75429000, 1.14763000, 0.14042000, 0.80252200, 0.14261900, 3.81532000, 0.24705600, 0.64934400, 0.22233000, 0.28445900, 0.57858800, 0.58369100, 0.47278000, 0.77612000, 0.30220000, 3.58440000, 0.76841400, 0.20288900, 1.84208000, 0.81967000, 16.18490000, 0.62608200, 0.58697000, 0.10912000, 0.77289200, 0.58463500, 1.66089000, 1.53299000, 0.51176500, 0.37952900, 0.43241300, 1.30337…
$ cell16 <dbl> 6.30810e-01, 1.74379e+00, 1.84235e+00, 3.92345e-01, 2.61606e+00, 1.54636e+00, 2.81570e-02, 2.01416e+00, 9.19726e-01, 4.99240e-02, 1.96029e-01, 2.64079e+00, 5.92140e-01, 3.23076e+00, 1.21340e-01, 1.34959e+00, 4.17450e-01, 1.27442e+00, 1.59875e-01, 2.46230e-01, 2.00173e-01, 1.22588e+00, 8.85627e-01, 2.39350e-01, 3.57280e-01, 8.26071e-01, 1.94077e-01, 9.31880e-02, 1.02497e+00, 4.03112e-02, 1.14933e+00, 2.36967e+00, 6.11220e-01, 1.99817e+00, 4.15720e-01, 1.85064e+00, 1.27860e-01, 3.48907e-01, 2.67077e+00, 3.12358e-01, 5.07407e-02, 1.58694e+00, 1.20904e+00, 4.44341e+00, 3.90184e-01, 1.31081e+00, 4.67240e-04, 1.69883e-01, 2.13952e-01, 2.15117e+00, 1.54993e+00, 5.47300e-02, 1.12241e+00, 1.93090e-01, 4.62140e+00, 3.23287e-01, 7.08080e-01, 2.98280e-01, 2.00552e-01, 6.54220e-01, 6.74670e-01, 6.17570e-01, 9.20190e-01, 4.96919e-01, 3.99285e+00, 9.98534e-01, 4.39920e-01, 2.23781e+00, 8.37260e-01, 1.84059e+01, 1.26432e+00, 7.63360e-02, 7.70780e-02, 1.11694e+00, 7.0421…
$ iniG <dbl> 5.66530e-01, 1.40743e+00, 1.66590e+00, 8.85543e-02, 2.41560e+00, 1.25660e+00, 2.28600e-02, 1.09708e+00, 1.04635e+00, 4.16140e-02, 2.22320e-01, 2.66642e+00, 4.98090e-01, 3.00702e+00, 1.45100e-01, 1.23986e+00, 3.92330e-01, 1.35977e+00, 1.82828e-01, 1.73323e-01, 3.41980e-01, 6.05508e-01, 7.58633e-01, 2.23890e-01, 2.58228e-01, 9.31983e-01, 2.46000e-01, 4.29070e-02, 8.93905e-01, 3.44082e-03, 9.58860e-01, 1.84328e+00, 5.95189e-01, 1.82242e+00, 3.33750e-01, 1.41622e+00, 1.31940e-01, 4.51420e-01, 2.26438e+00, 3.32360e-01, 4.50053e-02, 1.68475e+00, 1.10516e+00, 3.85193e+00, 5.11094e-01, 1.07302e+00, 5.28620e-04, 2.51760e-01, 1.86955e-01, 2.08841e+00, 1.48001e+00, 1.38720e-01, 1.00068e+00, 2.70140e-01, 4.35132e+00, 8.75432e-02, 6.27003e-01, 2.27230e-01, 1.79769e-01, 5.23063e-01, 5.63302e-01, 6.11400e-01, 7.82620e-01, 6.78410e-01, 3.64809e+00, 1.19590e+00, 1.70195e-01, 1.59778e+00, 7.69533e-01, 1.61785e+01, 1.57272e+00, 6.54920e-02, 1.65220e-01, 1.15200e+00, 6.7515…
$ latN <dbl> 6.56020e-01, 1.60770e+00, 1.92901e+00, 1.48615e-01, 2.84228e+00, 1.15077e+00, 1.41600e-01, 1.24657e+00, 1.23647e+00, 5.97930e-02, 2.87670e-01, 3.11005e+00, 6.75230e-01, 3.52794e+00, 1.33690e-01, 1.46253e+00, 3.92100e-01, 1.55889e+00, 2.93490e-01, 2.01361e-01, 4.45370e-01, 8.74752e-01, 8.09143e-01, 2.62910e-01, 4.10610e-01, 1.09934e+00, 2.50590e-01, 1.83880e-02, 1.02141e+00, 2.00058e-03, 1.26086e+00, 2.12950e+00, 7.99010e-01, 2.13169e+00, 4.19640e-01, 1.45474e+00, 1.41710e-01, 5.24710e-01, 3.08728e+00, 4.23560e-01, 1.46835e+00, 1.96051e+00, 1.19747e+00, 4.43640e+00, 1.06858e+00, 1.43388e+00, 5.32300e-04, 3.27810e-01, 1.89183e-01, 2.30298e+00, 1.59574e+00, 7.38420e-02, 1.53803e+00, 3.23490e-01, 5.04323e+00, 1.04762e-01, 8.44640e-01, 2.39760e-01, 1.57163e-01, 6.84430e-01, 6.94280e-01, 6.07570e-01, 8.23100e-01, 6.68700e-01, 4.15992e+00, 1.31243e+00, 1.50844e-01, 1.99327e+00, 8.42364e-01, 1.75978e+01, 2.85044e+00, 5.03780e-02, 1.42170e-01, 9.48280e-01, 8.2858…
$ midTII <dbl> 0.51649100, 1.47614000, 1.86640000, 0.14455300, 2.73108000, 1.00493000, 0.13914000, 1.19404000, 1.15203000, 0.05498100, 0.26129000, 3.13778000, 0.61660000, 3.08246000, 0.11558000, 1.39207000, 0.38579000, 1.39137000, 0.35927000, 0.27412000, 0.30976000, 1.06056000, 0.82864000, 0.28930000, 0.39583000, 1.10481000, 0.23575000, 0.06784000, 0.88612100, 0.00320937, 1.12370000, 1.84402000, 0.79298300, 1.58788000, 0.33307000, 1.19009000, 0.12321000, 0.59921000, 3.08591000, 0.38369000, 1.07596000, 1.66018000, 1.16994000, 4.11350000, 1.30775000, 1.49254000, 0.00027092, 0.31506000, 0.21978600, 2.24189000, 1.41341000, 0.10162000, 1.78241000, 0.34303000, 5.25463000, 0.33050300, 0.75703000, 0.20418100, 0.29036500, 0.60226000, 0.58384600, 0.43159000, 0.67521200, 0.61858000, 3.64306000, 1.22484000, 0.19632100, 1.80701000, 0.90437000, 16.52180000, 4.15660000, 0.02449100, 0.14110000, 0.37511500, 0.77145000, 1.70249000, 1.65501000, 0.58259000, 0.79794000, 0.46438000, 1.61223…
$ latTII <dbl> 0.71847000, 1.81019000, 2.29295000, 0.19764400, 2.99856000, 0.95946700, 0.15448000, 1.39381000, 1.50655000, 0.13078000, 0.36377000, 3.98064000, 0.73932000, 3.19899000, 0.11578000, 1.73527000, 0.39115000, 1.74421000, 0.40071000, 0.24575000, 0.28959000, 1.66706000, 1.11664000, 0.32713000, 0.45108000, 1.43025000, 0.41443000, 0.13884000, 1.06728000, 0.00861615, 1.43945000, 2.44021000, 0.94058300, 1.60715000, 0.38654000, 1.46237000, 0.11509000, 0.70846000, 4.22869000, 0.49455000, 1.10235000, 1.72177000, 1.28422000, 4.71681000, 2.00193000, 1.79324000, 0.04356600, 0.38657000, 0.34050000, 2.29164000, 1.71192000, 0.24534000, 2.34237000, 0.31987000, 5.60966000, 1.32448000, 0.86305000, 0.26963000, 1.01290000, 0.65483000, 0.65851000, 0.40867400, 1.10525000, 0.51764000, 3.90885000, 1.04650000, 0.26919000, 2.76587000, 1.04617000, 20.00400000, 5.74928000, 0.00034559, 0.19598000, 0.22023100, 0.89548000, 1.21065000, 2.35680000, 0.76510000, 0.65684500, 0.51234000, 1.74198…
$ larva <dbl> 0.65318000, 1.53632000, 2.15739000, 0.13155000, 2.77555000, 0.85500400, 0.14004000, 1.10616000, 1.32461000, 0.08081900, 0.33542000, 3.39425000, 0.67965500, 2.94764000, 0.10878000, 1.55143000, 0.36908900, 1.51174000, 0.45606000, 0.22818000, 0.18767700, 1.83204000, 1.19164000, 0.29738000, 0.46495000, 1.44716000, 0.34432000, 0.26671000, 0.98072000, 0.07533700, 1.02489000, 2.26527000, 1.35504000, 1.23834000, 0.35935000, 1.06997000, 0.12374000, 0.79668000, 3.74706000, 0.43708000, 1.10956000, 1.58167000, 1.16787000, 4.29374000, 1.14713000, 1.49614000, 0.05302900, 0.34318000, 0.33375000, 2.30244000, 1.64451000, 0.23702000, 2.40503000, 0.24724000, 5.79510000, 2.28556000, 0.83109000, 0.28237000, 1.71836000, 0.65069000, 0.60056000, 0.30379800, 1.05683000, 0.43174600, 3.08550000, 0.63474200, 0.16932400, 2.56829000, 1.08313000, 17.83330000, 6.19484000, 0.00035054, 0.14228000, 0.14032400, 0.86275000, 0.81396900, 1.99099000, 0.64302500, 0.61435000, 0.43833100, 1.72909…
# ciona_Number_of_peptides.df <- ciona_TMTproC_prenormal.df[,1:3]
vis_dat(ciona_TMTproC_prenormal.df)vis_miss(ciona_TMTproC_prenormal.df)miss_case_summary(ciona_TMTproC_prenormal.df)# A tibble: 7,095 × 3
case n_miss pct_miss
<int> <int> <dbl>
1 1 0 0
2 2 0 0
3 3 0 0
4 4 0 0
5 5 0 0
6 6 0 0
7 7 0 0
8 8 0 0
9 9 0 0
10 10 0 0
# ℹ 7,085 more rows
# 7095 protein entries1.0.1 Standard Normalisation
1.0.1.0.1 Normalisation step using rowMean 1st follow by columnMedian
###############################
#### Normalise each row by its average and then by the median columns
###############################
ciona_TMTproC_rowMean.df <-
ciona_TMTproC_prenormal.df %>%
rowwise() %>% # row-wise operations
mutate(row_mean = rowMeans(pick(c(4:11)))) %>% # calculate mean by row
mutate(across(c(4:11), ~ . / row_mean)) %>% # normalise by dividing each row by its mean
dplyr::select(-row_mean) %>% # remove the row_mean column
rename_at(c(4:11), ~ paste0(., "_by_rowMean")) # rename columns with _by_rowMean suffix
ciona_TMTproC_colMedian_normVector <-
ciona_TMTproC_rowMean.df %>%
ungroup() %>%
summarize(across(.cols = 4:11, .fns = median)) # summarize columns median
# # Double checking results
# col_medians <- apply(ciona_TMTproC_rowMean.df[,-c(1:3)], 2, median)
# col_medians_tbl <- as_tibble(t(col_medians))
# normalised by column median and cleaning up the names
ciona_TMTproC_rowMean_colMedian.df <-
ciona_TMTproC_rowMean.df %>%
dplyr::select(c(4:11)) %>%
map2_dfc(ciona_TMTproC_colMedian_normVector, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
rename_with(
.fn = ~ str_replace(., "_by_rowMean$", "_byRowCol"),
.cols = ends_with("_by_rowMean")
) %>%
bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal")
# write.table(ciona_TMTproC_rowMean_colMedian.df, "normalized_counts_notGood_TMTproC_ciona.txt", quote=F, col.names=T, row.names=F, sep="\t")
# Plotting raw data
plot_original_ciona <-
ciona_TMTproC_prenormal.df %>%
pivot_longer(cols = -c(1:3), names_to = "stage", names_transform = as.factor) %>%
ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +
geom_boxplot() +
scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
labs(title = "Raw data", y = "Developmental stages", x = " Relative abundance") +
theme_pubr(base_family = "Avenir") +
theme(legend.position = "none")
# Plotting normalised data
plot_normRowCol_ciona <-
ciona_TMTproC_rowMean_colMedian.df %>%
pivot_longer(-c(1:3), names_to = "stage", names_transform = as.factor) %>%
filter(str_detect(stage, "_byRowCol")) %>%
mutate(stage = str_remove(stage, "_byRowCol")) %>%
ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +
geom_boxplot() +
scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
labs(title = "Normalised not good", y = "Developmental stages", x = " Relative abundance") +
theme_pubr(base_family = "Avenir") +
theme(legend.position = "none")
plot_original_ciona + plot_normRowCol_ciona1.0.1.0.2 Assessing the normalisation step
CLuster 6 contains protein related to ribosomes, mitochondria, and housekeeping genes. This cluster’s expression decreases over time. Try a different normalization method to emove this technical variation and enhance the biological signals.
###############################
#### Evaluating the normalization process through k-means clustering.
###############################
set.seed(777) # reproduce the clusters
# Fit and predict clusters with k = 8
ciona_kmeans <- kmeans(ciona_TMTproC_rowMean_colMedian.df[, 4:11], centers = 8, nstart = 100, iter.max = 1000)
glance(ciona_kmeans) # extracts a single-row summary: totss, tot.withinss of clustering# A tibble: 1 × 4
totss tot.withinss betweenss iter
<dbl> <dbl> <dbl> <int>
1 13307. 3127. 10179. 6
# clean up the clustering and add cluster prediction to the original data set
ciona_results <- augment(ciona_kmeans, ciona_TMTproC_rowMean_colMedian.df)
ciona_results %>% slice_head(n = 5)# A tibble: 5 × 12
Protein_Id_transcript Protein_Id_Gene Number_of_peptides unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol .cluster
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 KY21.Chr1.10.v1.nonSL2-1 KY21.Chr1.10 5 1.21 1.21 1.07 1.02 0.999 0.838 0.942 0.935 6
2 KY21.Chr1.1000.v1.SL1-1 KY21.Chr1.1000 13 1.23 1.29 1.14 0.978 0.942 0.921 0.913 0.845 6
3 KY21.Chr1.1001.v1.SL1-1 KY21.Chr1.1001 15 1.05 1.03 1.04 1.00 0.980 1.01 1.00 1.03 4
4 KY21.Chr1.1002.v1.SL1-1 KY21.Chr1.1002 6 3.95 3.87 0.554 0.133 0.189 0.195 0.216 0.157 2
5 KY21.Chr1.1004.v1.nonSL4-1 KY21.Chr1.1004 21 1.06 1.06 1.06 1.04 1.03 1.05 0.937 0.946 4
# summarizes per-cluster level info eg how many point each cluster and within cluster sum of squares
ciona_cluster_avgSummary <- tidy(ciona_kmeans)
ciona_cluster_avgSummary# A tibble: 8 × 11
unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol size withinss cluster
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
1 0.446 0.462 0.598 0.767 1.06 1.34 1.46 1.60 822 518. 1
2 2.52 2.19 1.26 0.597 0.520 0.490 0.526 0.589 201 484. 2
3 0.534 0.612 0.915 1.45 1.47 1.28 0.928 0.861 405 378. 3
4 0.929 0.939 0.952 1.00 1.02 1.04 1.08 1.11 2429 419. 4
5 7.30 0.604 0.104 0.0796 0.0713 0.0965 0.156 0.390 37 37.5 5
6 1.22 1.23 1.17 1.05 0.995 0.928 0.861 0.824 2602 633. 6
7 0.126 0.0906 0.114 0.0852 0.112 0.455 1.81 4.35 191 218. 7
8 0.260 0.191 0.203 0.189 0.459 1.21 2.15 2.64 408 439. 8
# Visualise dynamics x cluster. Eg cluster 1 should be flatter
function_ggparallel(
data = ciona_cluster_avgSummary %>% rename_with(~ str_remove(., "_byRowCol")),
col_names = 1:8,
group_col = "cluster",
plot_title = "Ciona: cluster center dynamcis"
) +
geom_line(aes(color = cluster, size = as.factor(size))) +
scale_color_manual(values = ciona_cluster_palette) +
theme_pubr(base_family = "Avenir") +
facet_grid(~cluster) +
# Visualise each protein behavior x cluster
function_ggparallel(
data = ciona_results %>% rename_with(~ str_remove(., "_byRowCol")),
col_names = 4:11,
group_col = ".cluster",
plot_title = "Ciona: proteins dynamics x cluster"
) +
# geom_line(aes(color=cluster, size=as.factor(size))) +
scale_color_manual(values = ciona_cluster_palette) +
theme_pubr(base_family = "Avenir") +
facet_grid(~.cluster) +
theme(legend.position = "none") +
plot_layout(ncol = 1) & theme(plot.margin = unit(c(0.5, 0.5, 0, 0.5), "lines"), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)) & theme_pubr(base_family = "Avenir")# ciona_results %>% rename_with(~ str_remove(., "_byRowCol")) %>%
# pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
# mutate(stage = fct_relevel(stage, ciona_stages)) %>%
# ggplot(aes(stage,expression, color = .cluster)) +
# geom_point(aes(group = Protein_Id_transcript)) +
# geom_line(aes(group = Protein_Id_transcript)) +
# scale_color_manual(values = ciona_cluster_palette) +
# facet_wrap(~.cluster) + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))1.0.2 Improving the normalization process
It seems that dividing the protein values by the median column values is not an effective method for normalizing this data because the “flat” proteins are decreasing over time. As an alternative, we try using ribosomal and/or mitochondrial proteins as internal standards to improve the normalization process and better capture the stable proteins.
1.0.2.1 Mitocarta
The mitochondrial protein annotation provided by the Mitocarta database (https://www.broadinstitute.org/mitocarta/mitocarta30-inventory-mammalian-mitochondrial-proteins-and-pathways) includes only Entrez IDs. However, the Ciona gff3 annotation utilizes HGCN IDs. To map between these two annotations, we first use biomart and then select only the mitochondrial proteins present in our TMTcPro dataframe.
1.0.2.1.1 Genes and protein annotation
# Annotating by gff3
ciona_annotation_gff3 <- as_tibble(read.delim(file = paste0(directory, "a.reference_annotation/annotation_ciona.KY21ensembl.109.MT.tsv"), header = TRUE, sep = "\t", dec = ".", strip.white = T)) %>% dplyr::select(1, 2, 10, 11, 12)
glimpse(ciona_annotation_gff3)Rows: 55,205
Columns: 5
$ GeneID <chr> "KY21.Chr13.334", "KY21.Chr11.1168", "KY21.Chr11.1170", "KY21.Chr9.1164", "KY21.Chr7.558", "KY21.Chr11.343", "KY21.Chr12.128", "KY21.Chr2.200", "KY21.Chr3.56", "KY21.Chr1.156", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr9.577", "KY21.Chr9.577", "KY21.Chr12.699", "KY21.Chr12.699", "KY21.Chr9.398", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig32.28", "KY21.UAContig36.22", "KY21.UAContig49.9", "KY21.Chr8.497", "KY21.Chr7.192", "KY21.Chr9.981", "KY21.Chr9.1132", "KY21.Chr13.101", "KY21.Chr7.880", "KY21.Chr10.26", "KY21.Chr1.1883", "KY21.Chr2.1022", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr3.114", "KY21.Chr3.115", "KY21.Chr3.115", "KY21.Chr14.562", "KY21.Chr14.562", "KY21.UAContig2.22", "KY21.Chr1.2027", "KY21.Chr5.488", "KY21.Chr7.427"…
$ TranscriptID <chr> "KY21.Chr13.334.v1.ND1-1", "KY21.Chr11.1168.v2.SL1-1", "KY21.Chr11.1170.v1.SL1-1", "KY21.Chr9.1164.v1.ND1-1", "KY21.Chr7.558.v1.ND1-1", "KY21.Chr11.343.v1.nonSL1-1", "KY21.Chr12.128.v1.ND1-1", "KY21.Chr2.200.v1.SL1-1", "KY21.Chr3.56.v1.ND1-1", "KY21.Chr1.156.v1.SL1-1", "KY21.Chr1.157.v1.SL2-1", "KY21.Chr1.157.v1.SL1-1", "KY21.Chr1.157.v1.nonSL4-1", "KY21.Chr1.157.v1.nonSL6-1", "KY21.Chr1.157.v1.nonSL5-1", "KY21.Chr1.157.v1.nonSL8-1", "KY21.Chr1.157.v1.SL3-1", "KY21.Chr1.157.v1.nonSL7-1", "KY21.Chr9.577.v1.nonSL1-1", "KY21.Chr9.577.v1.nonSL2-1", "KY21.Chr12.699.v1.nonSL1-1", "KY21.Chr12.699.v1.nonSL2-1", "KY21.Chr9.398.v1.SL1-1", "KY21.UAContig13.6.v1.nonSL1-1", "KY21.UAContig13.6.v1.nonSL3-1", "KY21.UAContig13.6.v1.nonSL4-1", "KY21.UAContig13.6.v1.nonSL2-1", "KY21.UAContig13.6.v1.nonSL5-1", "KY21.UAContig32.28.v1.ND1-1", "KY21.UAContig36.22.v1.ND1-1", "KY21.UAContig49.9.v1.ND1-1", "KY21.Chr8.497.v1.SL1-1", "KY21.Chr7.192.v1.ND1-1", "KY21.Chr9.981.v1.ND1-1", "KY…
$ HumanID <chr> "A2M", "AADAC", "AADAC", "AAMP", "AANAT", "AANAT", "AANAT", "AARS1", "AARS1", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABCA10", "ABCA10", "ABCA2", "ABCA2", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA4", "ABCA5", "ABCA5", "ABCA5", "ABCA5", "ABCA8", "ABCA8", "ABCA9", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB10", "ABCB11", "TAP1", "ABCB5", "ABCB6", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB8", "ABCB8", "ABCB8", "ABCB9", "ABCB9", "ABCB9", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "AB…
$ HGNC <chr> "HGNC:7", "HGNC:17", "HGNC:17", "HGNC:18", "HGNC:19", "HGNC:19", "HGNC:19", "HGNC:20", "HGNC:20", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:30", "HGNC:30", "HGNC:32", "HGNC:32", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:34", "HGNC:35", "HGNC:35", "HGNC:35", "HGNC:35", "HGNC:38", "HGNC:38", "HGNC:39", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:41", "HGNC:42", "HGNC:43", "HGNC:46", "HGNC:47", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:49", "HGNC:49…
$ Human_longname <chr> "alpha-2-macroglobulin", "arylacetamide deacetylase", "arylacetamide deacetylase", "angio associated migratory cell protein", "aralkylamine N-acetyltransferase", "aralkylamine N-acetyltransferase", "aralkylamine N-acetyltransferase", "alanyl-tRNA synthetase 1", "alanyl-tRNA synthetase 1", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "ATP binding cassette subfamily A member 10", "ATP binding cassette subfamily A member 10", "ATP binding cassette subfamily A member 2", "ATP binding cassette subfamily A member 2", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3",…
# Annotating by HGNC
human_mitocarta <- read_excel(paste0(directory, "a.reference_annotation/Human.MitoCarta3.0.xls"), sheet = "A Human MitoCarta3.0", trim_ws = T, col_names = T) %>%
dplyr::select(1, 3, 5) %>%
mutate(human_mitocarta = "human_mitocarta")
# dplyr::select(1,3,5,15,16) $uniprot lenght and geneID
glimpse(human_mitocarta)Rows: 1,136
Columns: 4
$ HumanGeneID <dbl> 1537, 6390, 10229, 6389, 7384, 84274, 5160, 57017, 6182, 513, 9377, 122961, 9512, 7386, 498, 4967, 5162, 7385, 6392, 60488, 27089, 116540, 1629, 5166, 4191, 23107, 1431, 80273, 1737, 10128, 1743, 8050, 85476, 25874, 4719, 26589, 514, 5250, 51649, 2271, 23203, 506, 135154, 29796, 81689, 8803, 51805, 55699, 64960, 3419, 1353, 2110, 26519, 64981, 51069, 593, 7388, 192286, 539, 1892, 25875, 1337, 1355, 10939, 3030, 2108, 374291, 1376, 594, 3420, 23395, 35, 3954, 521, 4976, 549, 8802, 4729, 51004, 84545, 83451, 522, 4714, 9361, 1738, 9131, 79746, 139322, 124995, 92399, 4728, 34, 10989, 26520, 291, 25813, 4720, 4723, 50, 6832, 2235, 4528, 5245, 26275, 51116, 3313, 6834, 10935, 27069, 60558, 26517, 57128, 54948, 64976, 3421, 6391, 4711, 54949, 93058, 8209, 25828, 55168, 1345, 4715, 23788, 4700, 788, 65008, 10469, 10063, 440574, 6648, 1340, 7416, 8192, 3033, 33, 38, 115416, 10845, 4724, 51073, 708, 10531, 55245, 128308, 79922, 51102, 65080, 84263, 63931, 11194, 91647,…
$ Symbol <chr> "CYC1", "SDHB", "COQ7", "SDHA", "UQCRC1", "COQ5", "PDHA1", "COQ9", "MRPL12", "ATP5F1D", "COX5A", "ISCA2", "PMPCB", "UQCRFS1", "ATP5F1A", "OGDH", "PDHB", "UQCRC2", "SDHD", "MRPS35", "UQCRQ", "MRPL53", "DBT", "PDK4", "MDH2", "MRPS27", "CS", "GRPEL1", "DLAT", "LRPPRC", "DLST", "PDHX", "GFM1", "MPC2", "NDUFS1", "MRPL46", "ATP5F1E", "SLC25A3", "MRPS23", "FH", "PMPCA", "ATP5F1B", "SDHAF4", "UQCR10", "ISCA1", "SUCLA2", "COQ3", "IARS2", "MRPS15", "IDH3A", "COX11", "ETFDH", "TIMM10", "MRPL34", "MRPL2", "BCKDHA", "UQCRH", "HIGD2A", "ATP5PO", "ECHS1", "LETMD1", "COX6A1", "COX15", "AFG3L2", "HADHA", "ETFA", "NDUFS7", "CPT2", "BCKDHB", "IDH3B", "LARS2", "ACADS", "LETM1", "ATP5ME", "OPA1", "AUH", "SUCLG1", "NDUFV2", "COQ6", "MRPL43", "ABHD11", "ATP5PF", "NDUFB8", "LONP1", "DLD", "AIFM1", "ECHDC3", "APOOL", "MRPL10", "MRRF", "NDUFS8", "ACADM", "IMMT", "TIMM9", "SLC25A4", "SAMM50", "NDUFS2", "NDUFV1", "ACO2", "SUPV3L1", "FECH", "MTIF2", "PHB", "HIBCH", "MRPS2", "HSPA9", "SURF…
$ Description <chr> "cytochrome c1", "succinate dehydrogenase complex iron sulfur subunit B", "coenzyme Q7, hydroxylase", "succinate dehydrogenase complex flavoprotein subunit A", "ubiquinol-cytochrome c reductase core protein 1", "coenzyme Q5, methyltransferase", "pyruvate dehydrogenase E1 subunit alpha 1", "coenzyme Q9", "mitochondrial ribosomal protein L12", "ATP synthase F1 subunit delta", "cytochrome c oxidase subunit 5A", "iron-sulfur cluster assembly 2", "peptidase, mitochondrial processing subunit beta", "ubiquinol-cytochrome c reductase, Rieske iron-sulfur polypeptide 1", "ATP synthase F1 subunit alpha", "oxoglutarate dehydrogenase", "pyruvate dehydrogenase E1 subunit beta", "ubiquinol-cytochrome c reductase core protein 2", "succinate dehydrogenase complex subunit D", "mitochondrial ribosomal protein S35", "ubiquinol-cytochrome c reductase complex III subunit VII", "mitochondrial ribosomal protein L53", "dihydrolipoamide branched chain transacylase E2", "pyruvate dehydr…
$ human_mitocarta <chr> "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "huma…
1.0.2.1.2 Mapping NCBI Entrez Gene ID of Mitocarta to HGNC symbols
###############################
#### Convert the NCBI Entrez Gene IDs from Mitocarta to their corresponding HGNC symbols.
###############################
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # use human genes dataset
# head(listAttributes(ensembl),20) # define the attributes to retrieve (HGNC symbol and Entrez Gene ID)
# split in two to handle the number of attributes
attributes <- c("entrezgene_id", "hgnc_symbol", "hgnc_id", "description")
attributes2 <- c("entrezgene_id", "entrezgene_accession") # "entrezgene_description"
# convert Entrez Gene ID of mitocarta to HGNC symbols
entrez_ids <- human_mitocarta$HumanGeneID # 1136
hgnc_symbols <- getBM(
attributes = attributes,
filters = "entrezgene_id",
values = entrez_ids,
mart = ensembl
)
hgnc_symbols2 <- getBM(
attributes = attributes2,
filters = "entrezgene_id",
values = entrez_ids,
mart = ensembl
)
hgnc_symbols <-
hgnc_symbols %>%
left_join(hgnc_symbols2, by = "entrezgene_id") %>% # Merge the two dfs
relocate(hgnc_id, entrezgene_accession, hgnc_symbol, entrezgene_id, description)
glimpse(hgnc_symbols)Rows: 1,137
Columns: 5
$ hgnc_id <chr> "HGNC:40910", "HGNC:35410", "HGNC:40038", "HGNC:993", "HGNC:994", "HGNC:34528", "HGNC:35428", "HGNC:17312", "HGNC:40045", "HGNC:38844", "HGNC:47", "HGNC:2973", "HGNC:2264", "HGNC:44317", "HGNC:12367", "HGNC:9259", "HGNC:15714", "HGNC:16264", "HGNC:17366", "HGNC:10983", "HGNC:10985", "HGNC:50696", "HGNC:20567", "HGNC:18349", "HGNC:2244", "HGNC:16632", "HGNC:17310", "HGNC:16897", "HGNC:13734", "HGNC:16902", "HGNC:39", "HGNC:12730", "HGNC:1530", "HGNC:6737", "HGNC:17315", "HGNC:83", "HGNC:18001", "HGNC:14601", "HGNC:1329", "HGNC:17316", "HGNC:845", "HGNC:17663", "HGNC:17169", "HGNC:16637", "HGNC:14484", "HGNC:18155", "HGNC:7437", "HGNC:8587", "HGNC:14247", "HGNC:24639", "HGNC:7506", "HGNC:21062", "HGNC:9186", "HGNC:12843", "HGNC:344", "HGNC:7434", "HGNC:3978", "HGNC:2088", "HGNC:6985", "HGNC:8769", "HGNC:16985", "HGNC:24676", "HGNC:9354", "HGNC:315", "HGNC:15746", "HGNC:18431", "HGNC:53111", "HGNC:30862", "HGNC:6047", "HGNC:16429", "HGNC:11713", "HGNC:4907",…
$ entrezgene_accession <chr> "TSTD3", "TSTD1", "PET100", "BCL2L10", "BCL2L11", "TOMM6", "CMC4", "TIMM23", "PET117", "ATP5MF-PTCD1", "ABCB6", "DNM1L", "COX17", "PYURF", "TSFM", "PPIF", "LRPPRC", "TRAP1", "AASS", "SLC25A13", "SLC25A15", "PIGBOS1", "NME6", "DHRS2", "COQ7", "MRPS31", "TIMM17B", "RIDA", "GLYAT", "BCKDK", "ABCA9", "WARS2", "MICU1", "LYPLA1", "TIMM17A", "ACAA2", "TOMM40", "ECI2", "SLC30A9", "TIMM44", "ATP5PD", "PITRM1", "PRDX4", "HTATIP2", "MRPL28", "TXNRD2", "MTHFS", "PAICS", "ATP5MG", "PRELID3A", "MTX2", "FARS2", "POLQ", "YME1L1", "AHCYL1", "MTHFD2", "ALDH1L1", "CLPX", "ME3", "MRPS30", "DHRS4", "FASTK", "PRDX3", "AFG3L2", "TOMM34", "ACOT2", "HTD2", "UQCR11", "IMMT", "LIAS", "TDRKH", "HIBADH", "NUDT6", "NUDT5", "ABCB8", "PLPBP", "AKAP10", "MRPL3", "POLG2", "CA5B", "PXMP4", "RDH13", "FDX2", "HOGA1", "MTFR2", "PARK7", "PHB2", "ACOT7", "SDSL", "LACTB", "SLC25A25", "OSBPL1A", "PTPMT1", "OMA1", "SLC25A26", "MALSU1", "DHRS1", "CKMT1B", "CKMT2", "FAM210B", "COX20", "ACSM1", "TOP1…
$ hgnc_symbol <chr> "TSTD3", "TSTD1", "PET100", "BCL2L10", "BCL2L11", "TOMM6", "CMC4", "TIMM23", "PET117", "ATP5MF-PTCD1", "ABCB6", "DNM1L", "COX17", "PYURF", "TSFM", "PPIF", "LRPPRC", "TRAP1", "AASS", "SLC25A13", "SLC25A15", "PIGBOS1", "NME6", "DHRS2", "COQ7", "MRPS31", "TIMM17B", "RIDA", "GLYAT", "BCKDK", "ABCA9", "WARS2", "MICU1", "LYPLA1", "TIMM17A", "ACAA2", "TOMM40", "ECI2", "SLC30A9", "TIMM44", "ATP5PD", "PITRM1", "PRDX4", "HTATIP2", "MRPL28", "TXNRD2", "MTHFS", "PAICS", "ATP5MG", "PRELID3A", "MTX2", "FARS2", "POLQ", "YME1L1", "AHCYL1", "MTHFD2", "ALDH1L1", "CLPX", "ME3", "MRPS30", "DHRS4", "FASTK", "PRDX3", "AFG3L2", "TOMM34", "ACOT2", "HTD2", "UQCR11", "IMMT", "LIAS", "TDRKH", "HIBADH", "NUDT6", "NUDT5", "ABCB8", "PLPBP", "AKAP10", "MRPL3", "POLG2", "CA5B", "PXMP4", "RDH13", "FDX2", "HOGA1", "MTFR2", "PARK7", "PHB2", "ACOT7", "SDSL", "LACTB", "SLC25A25", "OSBPL1A", "PTPMT1", "OMA1", "SLC25A26", "MALSU1", "DHRS1", "CKMT1B", "CKMT2", "FAM210B", "COX20", "ACSM1", "TOP1…
$ entrezgene_id <int> 100130890, 100131187, 100131801, 10017, 10018, 100188893, 100272147, 100287932, 100303755, 100526740, 10058, 10059, 10063, 100996939, 10102, 10105, 10128, 10131, 10157, 10165, 10166, 101928527, 10201, 10202, 10229, 10240, 10245, 10247, 10249, 10295, 10350, 10352, 10367, 10434, 10440, 10449, 10452, 10455, 10463, 10469, 10476, 10531, 10549, 10553, 10573, 10587, 10588, 10606, 10632, 10650, 10651, 10667, 10721, 10730, 10768, 10797, 10840, 10845, 10873, 10884, 10901, 10922, 10935, 10939, 10953, 10965, 109703458, 10975, 10989, 11019, 11022, 11112, 11162, 11164, 11194, 11212, 11216, 11222, 11232, 11238, 11264, 112724, 112812, 112817, 113115, 11315, 11331, 11332, 113675, 114294, 114789, 114876, 114971, 115209, 115286, 115416, 115817, 1159, 1160, 116151, 116228, 116285, 116447, 116540, 116541, 117145, 118487, 118881, 118980, 119559, 122704, 122961, 123096, 123263, 123876, 124454, 124995, 125170, 125228, 125965, 125988, 126129, 126328, 126789, 127018, 128240, 12830…
$ description <chr> "thiosulfate sulfurtransferase like domain containing 3 [Source:HGNC Symbol;Acc:HGNC:40910]", "thiosulfate sulfurtransferase like domain containing 1 [Source:HGNC Symbol;Acc:HGNC:35410]", "PET100 cytochrome c oxidase chaperone [Source:HGNC Symbol;Acc:HGNC:40038]", "BCL2 like 10 [Source:HGNC Symbol;Acc:HGNC:993]", "BCL2 like 11 [Source:HGNC Symbol;Acc:HGNC:994]", "translocase of outer mitochondrial membrane 6 [Source:HGNC Symbol;Acc:HGNC:34528]", "C-X9-C motif containing 4 [Source:HGNC Symbol;Acc:HGNC:35428]", "translocase of inner mitochondrial membrane 23 [Source:HGNC Symbol;Acc:HGNC:17312]", "PET117 cytochrome c oxidase chaperone [Source:HGNC Symbol;Acc:HGNC:40045]", "ATP5MF-PTCD1 readthrough [Source:HGNC Symbol;Acc:HGNC:38844]", "ATP binding cassette subfamily B member 6 (Langereis blood group) [Source:HGNC Symbol;Acc:HGNC:47]", "dynamin 1 like [Source:HGNC Symbol;Acc:HGNC:2973]", "cytochrome c oxidase copper chaperone COX17 [Source:HGNC Symbol;Acc:HGN…
# ENSG00000230623 not in HGNC
# write.table(hgnc_symbols, "hgnc_symbols_to_match.txt", quote=F, col.names=T, row.names=F, sep="\t")
# human_match_table <- as_tibble(read.delim(file = paste0(directory,"a_reference_annotation/ciona_KY21/hgnc_symbols_to_match.txt"), header = TRUE, sep = "\t", dec = ".",strip.white = T))
# glimpse(human_match_table)
human_mitocarta_HGNC.df <-
human_mitocarta %>%
left_join(hgnc_symbols, multiple = "all", by = c("HumanGeneID" = "entrezgene_id")) %>%
dplyr::select(5, 1, 2, 6, 7) %>%
filter(!(hgnc_id == "HGNC:1995" & HumanGeneID == 1159)) %>%
arrange(hgnc_id) %>%
slice(-(1:3)) # 3 rows are empyty for hgnc_id and HumanGeneID
vis_dat(human_mitocarta_HGNC.df)vis_miss(human_mitocarta_HGNC.df)miss_case_summary(human_mitocarta_HGNC.df)# A tibble: 1,133 × 3
case n_miss pct_miss
<int> <int> <dbl>
1 1 0 0
2 2 0 0
3 3 0 0
4 4 0 0
5 5 0 0
6 6 0 0
7 7 0 0
8 8 0 0
9 9 0 0
10 10 0 0
# ℹ 1,123 more rows
# human_mitocarta_HGNC <- human_mitocarta_HGNC[!apply(human_mitocarta_HGNC == " ", 1, all),]
# diff <- human_mitocarta_HGNC$Symbol == human_mitocarta_HGNC$hgnc_symbol
# diff2 <- human_mitocarta_HGNC$entrezgene_accession == human_mitocarta_HGNC$hgnc_symbol
# human_mitocarta_HGNC[!diff, "entrezgene_accession"] # CKMT1A and CKMT1B are duplicates
# it look good you can drop the common columns
# PRORP HGNC:19958
# protein only RNase P catalytic subunit [Source:HGNC Symbol;Acc:HGNC:19958]
# protein only RNase P catalytic subunit1.0.2.1.3 Visualise mitocarta genes
###############################
#### Visualise and select stable mitochondria proteins
###############################
# Select the mitochondrial proteins present in TMTcPro with less than 2 fold change in relative abundance over the time course.
ciona_TMTproC_rowMean_mitocartaHGNC.df <-
ciona_TMTproC_rowMean.df %>%
left_join(ciona_annotation_gff3 %>% dplyr::select(2:4), by = c("Protein_Id_transcript" = "TranscriptID")) %>%
right_join(human_mitocarta_HGNC.df %>% dplyr::select(1, 5), by = c("HGNC" = "hgnc_id")) %>%
filter_at(vars(4:11), all_vars(. <= 2)) # fold change less than 2 over development
# 2nd filtering step
ciona_rowMean_mitocartaHGNC_flatDynamics.vector <-
ciona_TMTproC_rowMean_mitocartaHGNC.df %>%
rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
rowwise() %>%
transmute(
Protein_Id_transcript = Protein_Id_transcript,
diff_1cell_unfert = abs(`fertE` - unfE),
diff_16cell_1cell = abs(`cell16` - `fertE`),
diff_110cell_16cell = abs(`iniG` - `cell16`),
diff_lateneurula_110cell = abs(`latN` - `iniG`),
diff_midtailbud_lateneurula = abs(`midTII` - `latN`),
diff_latetailbud_midtailbud = abs(`latTII` - `midTII`),
diff_larva_latetailbud = abs(larva - `latTII`)
) %>%
mutate(diff_sum = sum(c_across(starts_with("diff")))) %>% # calculate sum difference
filter(diff_sum < 1) # keep only proteinz with sum less than 2 sum difference
# Filter step 1
toselect_flat <-
ciona_rowMean_mitocartaHGNC_flatDynamics.vector$Protein_Id_transcript
# Filter step 2
ciona_rowMean_mitocartaHGNC_flatDynamics.df <- ciona_TMTproC_rowMean_mitocartaHGNC.df[ciona_TMTproC_rowMean_mitocartaHGNC.df$Protein_Id_transcript %in% toselect_flat, ]
# # lag betwenn stages give the same resutls as above
# df %>%
# rowwise() %>%
# mutate(diff_all = sum(abs(diff(c_across(unfertilised:larva), lag = 1)))) %>%
# filter(diff_all < 1)
HGNCmitocarta.dynamics.1 <- plot_HGNCmitocarta_dynamics(0, 140)
HGNCmitocarta.dynamics.2 <- plot_HGNCmitocarta_dynamics(141, 270)
HGNCmitocarta.dynamics.1 # they look goodHGNCmitocarta.dynamics.2 # they look good# check result with plot: all proteins
ciona_TMTproC_rowMean_mitocartaHGNC.df %>%
ungroup() %>%
rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
mutate(stage = fct_relevel(stage, ciona_stages)) %>%
ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
geom_point(aes(group = Protein_Id_transcript)) +
labs(title = "Some are still too dynamics and need to be exclude for normalization") +
geom_line(aes(group = Protein_Id_transcript)) +
ylim(0, 2) +
theme_pubr(base_family = "Avenir") +
theme(legend.position = "none") +
ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
ungroup() %>%
rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
mutate(stage = fct_relevel(stage, ciona_stages)) %>%
ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
geom_point(aes(group = Protein_Id_transcript)) +
labs(title = "Good for normalization") +
geom_line(aes(group = Protein_Id_transcript)) +
ylim(0, 2) & theme_pubr(base_family = "Avenir",legend = "none")1.0.2.1.4 Normalisation step using rowMean follow by flat df columnMedian
###############################
#### Normalising using the median column of stable mitochondria proteins
###############################
# Median by column of mito protein
ciona_colMedian_normVector_mitocartaHGNC <-
ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
dplyr::select(4:11) %>%
ungroup() %>%
summarize_all(median)
# Normalise by median column
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df <-
ciona_TMTproC_rowMean.df %>%
dplyr::select(4:11) %>%
map2_dfc(ciona_colMedian_normVector_mitocartaHGNC, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal")
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df <-
ciona_TMTproC_rowMean.df %>%
dplyr::select(4:11) %>%
map2_dfc(ciona_colMedian_normVector_mitocartaHGNC, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
rename_with(
.fn = ~ str_replace(., "_by_rowMean$", "_byRowCol"),
.cols = ends_with("_by_rowMean")
) %>%
bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal") %>%
left_join(ciona_annotation_gff3 %>% dplyr::select(2:4), by = c("Protein_Id_transcript" = "TranscriptID")) %>%
relocate(1, 2, 12, 13, 3, everything())1.0.2.1.5 Assessing the mitocarta normalisation step
Normalization look good, with a cluster of protein ( ribosome and mitocondria) that are flat during embryogeneis . eg housekeeping
###############################
#### Plot before after mitocarta only proteins dynamics
###############################
# Before
ciona_avgNorm_std <-
ciona_TMTproC_rowMean_colMedian.df %>%
filter(Protein_Id_transcript %in% toselect_flat) %>%
# ungroup() %>%
summarize(across(c(4:11), median)) %>%
pivot_longer(everything()) %>%
mutate(
name = str_remove(name, "_byRowCol"),
name = case_when(
name == "unfE" ~ 1,
name == "fertE" ~ 2,
name == "cell16" ~ 3,
name == "iniG" ~ 4,
name == "latN" ~ 5,
name == "midTII" ~ 6,
name == "latTII" ~ 7,
name == "larva" ~ 8
)
)
# After
ciona_avgNorm_mitocarta <-
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
filter(Protein_Id_transcript %in% toselect_flat) %>%
summarize(across(c(6:13), median)) %>%
pivot_longer(everything()) %>%
mutate(
name = str_remove(name, "_byRowCol"),
name = case_when(
name == "unfE" ~ 1,
name == "fertE" ~ 2,
name == "cell16" ~ 3,
name == "iniG" ~ 4,
name == "latN" ~ 5,
name == "midTII" ~ 6,
name == "latTII" ~ 7,
name == "larva" ~ 8
)
)
function_ggparallel(
data = ciona_TMTproC_rowMean_colMedian.df %>%
filter(Protein_Id_transcript %in% toselect_flat) %>% rename_with(~ str_remove(., "_byRowCol")),
col_names = 4:11,
group_col = "Protein_Id_transcript",
plot_title = "Ciona: std norm prot dynamcis"
) +
theme(legend.position = "none") +
ggplot(ciona_avgNorm_std, aes(x = name, y = value)) +
geom_point() +
theme_pubr(base_family = "Avenir") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
ggtitle("Linear Regression of Stage vs. Value") +
xlab("Stage") +
ylab("Value") +
function_ggparallel(
data = ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
filter(Protein_Id_transcript %in% toselect_flat) %>% rename_with(~ str_remove(., "_byRowCol")),
col_names = 6:13,
group_col = "Protein_Id_transcript",
plot_title = "Ciona: std mito prot dynamcis"
) +
theme(legend.position = "none") +
ggplot(ciona_avgNorm_mitocarta, aes(x = name, y = value)) +
# ggplot(ciona_avgNorm_mitocarta, aes(x = as.numeric(sub("Stage ", "", name)), y = value)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
ggtitle("Linear Regression of Stage vs. Value") +
theme_pubr(base_family = "Avenir") +
xlab("Stage") +
ylab("Value") & theme_pubr(legend = "none",base_family = "Avenir")# Save the normalised TMTproC data for Ciona
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
rename_with(~ str_remove(., "_byRowCol")) %>%
rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") %>%
write.table(., paste0(directory_save, "/ciona_proteome_norm.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")1.0.2.1.6 Same results as above
# # divide each column of df by the corresponding column of vector median
# norm_ciona_mitocarta <- ciona_TMTproC_rowMean.df
# norm_ciona_mitocarta[, 4:11] <- mapply('/', norm_ciona_mitocarta[, 4:11], ciona_colMedian_normVector_mitocartaHGNC)
#
# norm_ciona_mitocarta
# ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df1.0.2.1.7 Plotting mitocarta normalisation
###############################
#### Kmeans dynamics of Ciona protein dataset with mitocarta normalisation
###############################
set.seed(777) # reproduce the clusters
# Fit and predict clusters with k = 8
ciona_kmeans_mito <- kmeans(ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df[,6:13], centers = 8, nstart = 100, iter.max = 1000)
# clean up the clustering and add cluster prediction to the original data set
ciona_results_mito <- augment(ciona_kmeans_mito, ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df)
ciona_results_mito %>% slice_head(n = 5)# A tibble: 5 × 14
Protein_Id_transcript Protein_Id_Gene HumanID HGNC Number_of_peptides unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol .cluster
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 KY21.Chr1.10.v1.nonSL2-1 KY21.Chr1.10 TAF9 HGNC:11542 5 1.04 1.01 1.00 0.961 1.00 0.863 1.06 1.07 1
2 KY21.Chr1.1000.v1.SL1-1 KY21.Chr1.1000 PSMB7 HGNC:9544 13 1.06 1.08 1.06 0.918 0.943 0.948 1.03 0.970 1
3 KY21.Chr1.1001.v1.SL1-1 KY21.Chr1.1001 LRRC47 HGNC:29207 15 0.902 0.856 0.974 0.942 0.981 1.04 1.13 1.18 1
4 KY21.Chr1.1002.v1.SL1-1 KY21.Chr1.1002 TDRD1 HGNC:11712 6 3.40 3.23 0.519 0.125 0.189 0.201 0.243 0.180 3
5 KY21.Chr1.1004.v1.nonSL4-1 KY21.Chr1.1004 DNAJC2 HGNC:13192 21 0.908 0.881 0.988 0.976 1.03 1.09 1.05 1.08 1
# summarizes per-cluster level info eg how many point each cluster and within cluster sum of squares
ciona_cluster_avg_summary_mito <- tidy(ciona_kmeans_mito)
ciona_cluster_avg_summary_mito# A tibble: 8 × 11
unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol size withinss cluster
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
1 1.01 0.983 1.06 0.993 1.01 0.979 0.999 0.981 3193 572. 1
2 0.748 0.730 0.847 0.910 1.01 1.10 1.28 1.37 1998 430. 2
3 1.90 1.86 1.33 0.684 0.611 0.552 0.563 0.552 237 403. 3
4 0.111 0.0689 0.100 0.0740 0.0956 0.363 1.86 5.32 136 173. 4
5 5.73 0.492 0.142 0.105 0.127 0.170 0.299 0.710 47 112. 5
6 0.332 0.326 0.466 0.604 0.999 1.42 1.78 2.02 664 514. 6
7 0.449 0.497 0.829 1.32 1.47 1.37 1.07 1.01 444 424. 7
8 0.215 0.143 0.168 0.140 0.342 1.09 2.49 3.32 376 417. 8
# Visualise dynamics x cluster
function_ggparallel(data = ciona_cluster_avg_summary_mito %>% rename_with(.fn = ~str_replace(., "_byRowCol$", ""),.cols = ends_with("_byRowCol")),
col_names = 1:8,
group_col = "cluster",
plot_title = "Ciona mito: cluster dynamcis") +
geom_line(aes(color=cluster, size=as.factor(size))) +
scale_color_manual(values = ciona_cluster_palette) +
theme_pubr(base_family = "Avenir") +
facet_grid(~ cluster) +
theme(legend.position = "bottom") +
# Visualise each protein behavior x cluster
function_ggparallel(data = ciona_results_mito %>% rename_with(.fn = ~str_replace(., "_byRowCol$", ""),.cols = ends_with("_byRowCol")),
col_names = 6:13,
group_col = ".cluster",
plot_title = "Ciona mito: proteins dynamics x cluster") +
geom_line(aes(color=.cluster)) +
scale_color_manual(values = ciona_cluster_palette) +
theme_pubr(base_family = "Avenir") +
facet_grid(~ .cluster) +
theme(legend.position = "none") &
plot_layout(ncol = 1) & theme(plot.margin = unit(c(0.5, 0.5, 0, 0.5), "lines"), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)) & theme_pubr(legend = "none",base_family = "Avenir")# results_mito %>% pivot_longer(cols = c(6:13),
# names_to = "stage",
# values_to = "expression") %>%
# mutate(stage = str_remove(stage, "_byRowCol"),
# stage = fct_relevel(stage, ciona_stages)) %>%
# ggplot(aes(stage,expression, color = .cluster)) +
# geom_point(aes(group = Protein_Id_transcript)) +
# geom_line(aes(group = Protein_Id_transcript)) +
# facet_wrap(~.cluster)
# Boxplot all entries
plot_normGood_ciona <-
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%
filter(str_detect(stage, "_byRowCol")) %>%
mutate(stage = str_remove(stage, "_byRowCol")) %>%
ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +
# geom_line(aes(group = Protein_Id_transcript)) +
# geom_point(aes(color=stage)) +
geom_boxplot() +
scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
labs(title = "Normalised mitocarta good", y = "Developmental stages", x ="Relative abundance") +
theme_pubr(base_family = "Avenir") + theme(legend.position = "none") + theme_pubr(legend = "none",base_family = "Avenir")
plot_original_ciona + plot_normRowCol_ciona + plot_normGood_ciona & theme_pubr(legend = "none",base_family = "Avenir")1.0.2.2 Ribosomes
Normalising with ribosomes yield simialr results
ciona_ribosome <- as_tibble(read_excel(path = paste0(directory,"a.reference_annotation/Supp Table - KY21_info.xlsx"),col_names = T,trim_ws = T,sheet = "ribosome_prot"))
glimpse(ciona_ribosome)Rows: 3,307
Columns: 3
$ GeneID <chr> "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.754", "KY21.Chr1.754", "KY21.Chr1.754", "KY21.Chr1.770", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.795", "KY21.Chr1.795", "KY21.Chr1.797", "KY21.Chr1.824", "KY21.Chr1.884", "KY21.Chr1.904", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.1015", "KY21.Chr1.1071", "KY21.Chr1.1071", "KY21.Chr1.1072", "KY21.Chr1.1072", "KY…
$ TranscriptID <chr> "KY21.Chr1.702.v1.SL1-1", "KY21.Chr1.702.v1.SL2-1", "KY21.Chr1.702.v1.SL3-1", "KY21.Chr1.702.v1.nonSL4-1", "KY21.Chr1.702.v1.nonSL5-1", "KY21.Chr1.705.v1.ND1-1", "KY21.Chr1.705.v1.SL2-1", "KY21.Chr1.705.v1.SL3-1", "KY21.Chr1.705.v1.SL4-1", "KY21.Chr1.705.v1.SL5-1", "KY21.Chr1.705.v1.SL6-1", "KY21.Chr1.705.v1.SL7-1", "KY21.Chr1.705.v1.SL8-1", "KY21.Chr1.720.v1.nonSL1-1", "KY21.Chr1.720.v1.nonSL2-1", "KY21.Chr1.720.v1.nonSL3-1", "KY21.Chr1.720.v1.nonSL4-1", "KY21.Chr1.720.v1.nonSL5-1", "KY21.Chr1.754.v1.SL1-1", "KY21.Chr1.754.v1.nonSL2-1", "KY21.Chr1.754.v1.nonSL1-1", "KY21.Chr1.770.v1.ND1-1", "KY21.Chr1.791.v1.SL1-1", "KY21.Chr1.791.v1.nonSL2-1", "KY21.Chr1.791.v1.nonSL3-1", "KY21.Chr1.791.v2.SL1-2", "KY21.Chr1.791.v2.nonSL2-2", "KY21.Chr1.791.v2.nonSL3-2", "KY21.Chr1.791.v3.SL1-3", "KY21.Chr1.791.v3.nonSL2-3", "KY21.Chr1.791.v3.nonSL3-3", "KY21.Chr1.791.v4.SL1-4", "KY21.Chr1.791.v4.nonSL2-4", "KY21.Chr1.791.v4.nonSL3-4", "KY21.Chr1.791.v5.SL1-5", "KY21.Chr1.791.v…
$ annotation <chr> "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "…
# Annotated ribosomes
ciona_TMTproC_rowMean_ribosome.df <-
ciona_TMTproC_rowMean.df %>%
dplyr::select(c(1,4:11)) %>%
right_join(ciona_ribosome, by = c("Protein_Id_transcript" = "TranscriptID")) %>%
dplyr::select(-GeneID) %>%
rename_with(
.fn = ~str_replace(., "_by_rowMean$", ""),
.cols = ends_with("_by_rowMean")) %>%
filter_at(vars(2:9), all_vars(. <= 2)) # fc less than 2
ciona_TMTproC_rowMean_ribosome.df %>% group_by(annotation) %>% tally() # 314 ribosome associated proteins ==> 298 with cut off # A tibble: 1 × 2
annotation n
<chr> <int>
1 ribo 298
# write.table("HGNCmitocarta.dynamics.df.txt", quote=F, col.names=T, row.names=F, sep="\t")
###############################
#### Normalising using the median column of stable ribosomes proteins
###############################
# Median by column
ciona_colMedian_normVector_Ribo <-
ciona_TMTproC_rowMean_ribosome.df %>%
dplyr::select(2:9) %>%
ungroup() %>%
summarize_all(median)
# divide each column of df by the corresponding column of vector median
ciona_normRibo <- ciona_TMTproC_rowMean.df #%>% drop_na()
ciona_normRibo[, 4:11] <- mapply('/', ciona_normRibo[, 4:11], ciona_colMedian_normVector_Ribo)
colnames(ciona_normRibo) <- c("Protein_Id_transcript", "Protein_Id_Gene", "Number_of_peptides", "unfE_byRowCol", "fertE_byRowCol", "cell16_byRowCol", "iniG_byRowCol", "latN_byRowCol", "midTII_byRowCol", "latTII_byRowCol", "larva_byRowCol")
###############################
#### Kmeans dynamics of Ciona protein dataset with ribosomes normalisation
###############################
set.seed(777) # reproduce the clusters
# Fit and predict clusters with k = 8
ciona_kmeans_ribo <- kmeans(ciona_normRibo[,4:11], centers = 8, nstart = 100, iter.max = 1000)
# clean up the clustering and add cluster prediction to the original data set
ciona_results_ribo <- augment(ciona_kmeans_ribo, ciona_normRibo)
# summarizes per-cluster level info
ciona_cluster_avgSummary_ribo <- tidy(ciona_kmeans_ribo)
ciona_cluster_avgSummary_ribo# A tibble: 8 × 11
unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol size withinss cluster
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
1 2.03 1.97 1.32 0.676 0.593 0.549 0.550 0.548 233 431. 1
2 0.793 0.784 0.859 0.942 1.01 1.09 1.22 1.28 2023 427. 2
3 1.06 1.05 1.07 1.03 1.01 0.979 0.949 0.922 3142 570. 3
4 0.117 0.0732 0.101 0.0764 0.0958 0.364 1.77 5.01 136 159. 4
5 0.349 0.344 0.473 0.627 1.01 1.43 1.69 1.90 666 499. 5
6 0.226 0.152 0.169 0.144 0.345 1.09 2.37 3.13 378 407. 6
7 0.471 0.526 0.838 1.34 1.46 1.37 1.03 0.969 470 440. 7
8 6.02 0.524 0.143 0.108 0.128 0.171 0.285 0.670 47 116. 8
# ciona_cluster_avgSummary_ribo %>% filter(cluster == 5)
ciona_cluster_avgSummary_ribo %>%
pivot_longer(cols = c(1:8), names_to = "stage", values_to = "expression") %>%
mutate(stage = str_remove(stage, "_byRowCol")) %>%
mutate(stage = fct_relevel(stage, ciona_stages), cluster = factor(cluster)) %>%
ggplot(aes(stage, expression, color = cluster)) +
geom_point() +
geom_line(aes(group = interaction(cluster, size), size = as.factor(size))) +
scale_color_manual(values = ciona_cluster_palette) +
theme(legend.position = "bottom", plot.title = element_text(size = 10)) +
labs(title = "Ribosome normalization", x = "") & theme_pubr(legend = "bottom",base_family = "Avenir") # scale_size_discrete(name = "Cluster size") + # rename the size legend
# guides(color = guide_none())1.0.3 Comparing normalization process
###############################
#### cluster plots
###############################
plot_std_norm <- plot_cluster_norm(ciona_cluster_avgSummary, "Before good normalization") + scale_color_manual(values = c("1" = "#AB3329FF", "2" = "#88A0DCFF", "3" = "#ED968CFF","4" = "#381A61FF", "5" = "#E78429FF", "6" = "#7FA074FF","7" = "#7C4B73FF", "8" = "#F9D14AFF"))
plot_mito_norm <- plot_cluster_norm(ciona_cluster_avg_summary_mito, "Mitocarta normalization")
plot_ribo_norm <- plot_cluster_norm(ciona_cluster_avgSummary_ribo, "Ribosome normalization") +
scale_color_manual(values = c("1" = "#7FA074FF", "2" = "#7C4B73FF", "3" = "#88A0DCFF","4" = "#E78429FF", "5" = "#381A61FF", "6" = "#ED968CFF","7" = "#F9D14AFF", "8" = "#AB3329FF"))
top <- plot_std_norm + plot_mito_norm + plot_ribo_norm
###############################
#### data pivot and linear regression
###############################
# function for cluster plots
ciona_results_std_pivot <- cluster_pivot_data(ciona_results, 2, c(1,4:11))
ciona_results_mito_pivot <- cluster_pivot_data(ciona_results_mito, 1, c(1,6:13))
ciona_results_ribo_pivot <- cluster_pivot_data(ciona_results_ribo, 3, c(1,4:11))
# Plot linear regression for each normalization method
bottom <-
plot_cluster_lr(ciona_results_std_pivot, "Standard normalisaton", FALSE, "B1") +
plot_cluster_lr(ciona_results_mito_pivot, "Mitocarta normalisaton", FALSE,"B2") +
plot_cluster_lr(ciona_results_ribo_pivot, "Ribosome normalisaton", FALSE, "B3") & theme_pubr(base_family = "Avenir")
top / bottom + plot_annotation(title = 'Normalization comparison') + plot_annotation(tag_levels = 'A') &
theme(text = element_text('Avenir'), plot.title = element_text(size = 30), plot.tag = element_text(size = 15)) 1.0.4 Relative distribution
Counts per protein scaled to integrate to 1. This is useful, to compare the proteome between species.
###############################
#### reshaping the data to span 0-1 and/or to the number of multiplext TMT channels (8 for ciona; 10 in THao et al.;11 in Itallie et al.)
###############################
# Sum of channels/ stages is 8 (8 channels - 8 development stages)
ciona_proteome <-
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
rename_with(~ str_remove(., "_byRowCol")) %>%
rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") |>
ungroup()
ciona_proteome_normChan_tidy_wide <-
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
rename_with(~ str_remove(., "_byRowCol")) %>%
rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") |>
rowwise() %>%
mutate(across(c(6:13), ~ .x / sum(c_across(6:13))) * 8) %>%
ungroup()
# mutate(row_sum = rowSums(across(c(6:13)), na.rm = TRUE)) # checking rowSum protein is 1
# Sum of channels/stages is 1
ciona_proteome_normTo1_tidy_wide <- ciona_proteome_normChan_tidy_wide |>
# dplyr::select(-number_of_peptides) |>
mutate(across(c(6:13), ~./8 ))
# mutate(row_sum = rowSums(across(c(6:13)), na.rm = TRUE)) # checking rowSum protein is 1
write.table(ciona_proteome_normTo1_tidy_wide, paste0(directory_save, "/ciona_proteome_normTo1.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")
write.table(ciona_proteome_normChan_tidy_wide, paste0(directory_save, "/ciona_proteome_normToChan.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")Checking that the pattern is the same
# Picking the protein Thr
a1 <- ciona_proteome |>
filter(human_id =="THRB") |>
pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |>
mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |>
ggplot(aes(x = stage, y = value)) +
geom_point() +
labs(title = "Scatterplot of Protein Expression Across Stages",
x = "Stage",
y = "Expression Level") +
theme_pubr(base_family = "Avenir")
a2 <- ciona_proteome_normChan_tidy_wide |>
filter(human_id =="THRB") |>
pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |>
mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |>
ggplot(aes(x = stage, y = value)) +
geom_point() +
labs(title = "Scatterplot of Protein Expression Across Stages",
x = "Stage",
y = "Expression Level") +
theme_pubr(base_family = "Avenir")
a3 <- ciona_proteome_normTo1_tidy_wide |>
filter(human_id =="THRB") |>
pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |>
mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |>
ggplot(aes(x = stage, y = value)) +
geom_point() +
labs(title = "Scatterplot of Protein Expression Across Stages",
x = "Stage",
y = "Expression Level") +
theme_pubr(base_family = "Avenir")
a1 / a2 /a3 # & coord_fixed(ratio = 1)
# a1 / a2 /a3 & coord_equal()